# Course: Chinese text mining
# Student: 姜浩然

import os
import math
import shutil
import jieba
import pandas as pd
import jieba.analyse


def get_txt():
    # sep='aaa' never matches, so every physical line of the novel becomes one row in 'txt'
    raw = pd.read_csv(r"D:\python\金庸-射雕英雄传txt精校版.txt",
                      names=['txt'], sep='aaa', encoding='gbk', engine='python')

    def m_head(tmpstr):
        return tmpstr[:1]            # first character of the line

    def m_mid(tmpstr):
        return tmpstr.find("回 ")    # position of "回 " inside a chapter heading

    raw['head'] = raw.txt.apply(m_head)
    raw['mid'] = raw.txt.apply(m_mid)
    raw['len'] = raw.txt.apply(len)

    # A chapter heading is a short line (< 30 chars) starting with "第" and containing "回 ";
    # once past chapter 40, the appendix material is folded back into chapter 0
    chapnum = 0
    for i in range(len(raw)):
        if raw['head'][i] == '第' and raw['mid'][i] > 0 and raw['len'][i] < 30:
            chapnum += 1
        if chapnum >= 40 and raw['txt'][i] == "附录一：成吉思汗家族":
            chapnum = 0
        raw.loc[i, 'chap'] = chapnum
    return raw
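
# A quick sanity check (a sketch, not part of the original flow): the 'chap'
# column should run from 0 (front matter / appendices) up to 40.
#   raw = get_txt()
#   print(raw['chap'].value_counts().sort_index())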

# Concatenate all lines belonging to chapter `num` into a single string
def chap_num(raw, num):
    tmpchap = raw[raw['chap'] == num]
    return ''.join(tmpchap['txt'])

# Load the stop-word list (one word per line)
def stop_words():
    stopwords = []
    with open(r"D:\python\停用词.txt", 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            if word:                 # the original len(line) > 0 was always true: lines keep their '\n'
                stopwords.append(word)
    return stopwords
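
# Note: the same file is also handed to jieba.analyse.set_stop_words() in the
# main block; this manual loader only feeds the hand-rolled IDF build below.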

# Count, for each word, the number of chapters it appears in (document frequency)
def get_dict(res):
    stopwords = set(stop_words())    # load the stop-word list once, not once per word
    all_dict = {}
    for content in res:
        # a set gives document frequency: each word counts at most once per chapter
        words = set()
        for word in jieba.cut(content.strip()):
            if word not in stopwords and word not in (' ', '\t', '\n'):
                words.add(word)
        for word in words:
            all_dict[word] = all_dict.get(word, 0) + 1
    return all_dict
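
# Interactive sketch (assumes `lists` from make_dirs() below): inspect the
# highest-document-frequency words to eyeball the stop-word filtering.
#   df = get_dict(lists)
#   print(sorted(df.items(), key=lambda kv: kv[1], reverse=True)[:10])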

# Build the IDF dictionary and write it out as a jieba-style corpus file
def work(result):
    all_dict = get_dict(result)
    idf_dict = {}
    for key in all_dict:
        s = '%.10f' % math.log10(40 / (all_dict[key] + 1))   # 40 chapters in total
        # keep only words starting with a CJK ideograph (U+4E00..U+9FA5)
        if u'\u4e00' <= key[0] <= u'\u9fa5':
            idf_dict[key] = s
    print('IDF dictionary built')
    with open(r"D:\python\idf语料库.txt", 'w', encoding='utf-8') as f:
        for word in idf_dict:
            f.write(word + ' ' + idf_dict[word] + '\n')
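
# The corpus file written above uses the "word idf" one-pair-per-line format
# that jieba.analyse.set_idf_path() consumes, e.g. (illustrative values only,
# not computed from this corpus):
#   郭靖 0.1072250712
#   黄蓉 0.1339772782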

# (Re)create the chapter directory and write one file per chapter
def make_dirs(path):
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created')
    else:
        # the directory already exists: remove it and start from a clean slate
        print(path + ' already exists')
        shutil.rmtree(path)
        print('Removed the existing directory.')
        os.mkdir(path)
        print('Recreated the directory.')
    raw = get_txt()                  # parse the novel once instead of once per chapter
    lists = []
    for i in range(40):
        content = chap_num(raw, i + 1)
        with open(path + os.sep + "第" + str(i + 1) + "章.txt", 'w',
                  encoding='utf-8') as f:
            f.write(content)
            print("Chapter " + str(i + 1) + " written.")
        lists.append(content)
    return lists


if __name__ == '__main__':
    path = './射雕英雄传集'
    lists = make_dirs(path)
    work(lists)
    with open(r"D:\python\金庸-射雕英雄传txt精校版.txt", encoding='gbk') as f:
        jieba.load_userdict(f)       # refine the segmentation with a custom dictionary
    jieba.analyse.set_stop_words(r"D:\python\停用词.txt")   # apply the stop-word list inside TF-IDF
    jieba.analyse.set_idf_path(r"D:\python\idf语料库.txt")  # use the custom IDF corpus built above
    TFres1 = jieba.analyse.extract_tags(lists[0], withWeight=True, topK=10)
    print(TFres1)
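
    # Optional cross-check (an addition, not part of the original assignment):
    # jieba also ships a TextRank keyword extractor, which needs no IDF corpus
    # and makes a useful comparison against the TF-IDF keywords above.
    TRres1 = jieba.analyse.textrank(lists[0], withWeight=True, topK=10)
    print(TRres1)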

